import pandas as pd
import numpy as np
import statistics #mathematical statistics
import matplotlib.pyplot as plt
import plotly.express as px #create entire figures
import seaborn as sns #data visuals
#html export
import plotly.io as pio
#use the notebook renderer for plotly figures (html export)
pio.renderers.default = 'notebook'
#load the survey answers into a DataFrame and show them
df_psy = pd.read_csv(filepath_or_buffer='psy_data_setEN.csv')
df_psy
| Person | age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 16 | Never | Hetero | Other | 1.0 | 7.0 | 7.0 | 7.0 | 7.0 | 5.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 1 | 2 | 16 | Never | New Types | Christian Other | 6.0 | 4.0 | 7.0 | 4.0 | 7.0 | 7.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 2 | 3 | 17 | Never | Homo | Christian | 2.0 | 2.0 | 2.0 | 6.0 | 5.0 | 5.0 | 3.0 | 3.0 | 4.0 | 3.0 | 4.0 |
| 3 | 4 | 13 | Never | New Types | Christian | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 1.0 | 6.0 | 2.0 | 2.0 | 2.0 | 1.0 |
| 4 | 5 | 19 | Never | Hetero | Muslim | 2.0 | 3.0 | 6.0 | 5.0 | 5.0 | 6.0 | 3.0 | 2.0 | 3.0 | 4.0 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39774 | 39772 | 21 | Never | New Types | Muslim | 4.0 | 5.0 | 7.0 | 6.0 | 4.0 | 7.0 | 4.0 | 3.0 | 3.0 | 4.0 | 4.0 |
| 39775 | 39773 | 48 | Married | Hetero | Christian Other | 6.0 | 7.0 | 5.0 | 3.0 | 6.0 | 1.0 | 5.0 | 2.0 | 2.0 | 1.0 | 2.0 |
| 39776 | 39774 | 20 | Never | Hetero | Protestant | 1.0 | 5.0 | 7.0 | 5.0 | 3.0 | 5.0 | 3.0 | 3.0 | 2.0 | 4.0 | 4.0 |
| 39777 | 39775 | 26 | Never | Hetero | Muslim | 6.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 | 1.0 | 2.0 | 2.0 | 1.0 | 3.0 |
| 39778 | 39775 | 26 | Never | Hetero | Muslim | 6.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 | 1.0 | 2.0 | 2.0 | 1.0 | 3.0 |
39779 rows × 16 columns
Data Wrangling involves:
#remove the first column (Person) from the data
person_column = df_psy.columns[0]
df_psy = df_psy.drop(columns=person_column)
df_psy
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16 | Never | Hetero | Other | 1.0 | 7.0 | 7.0 | 7.0 | 7.0 | 5.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 1 | 16 | Never | New Types | Christian Other | 6.0 | 4.0 | 7.0 | 4.0 | 7.0 | 7.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 2 | 17 | Never | Homo | Christian | 2.0 | 2.0 | 2.0 | 6.0 | 5.0 | 5.0 | 3.0 | 3.0 | 4.0 | 3.0 | 4.0 |
| 3 | 13 | Never | New Types | Christian | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 1.0 | 6.0 | 2.0 | 2.0 | 2.0 | 1.0 |
| 4 | 19 | Never | Hetero | Muslim | 2.0 | 3.0 | 6.0 | 5.0 | 5.0 | 6.0 | 3.0 | 2.0 | 3.0 | 4.0 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39774 | 21 | Never | New Types | Muslim | 4.0 | 5.0 | 7.0 | 6.0 | 4.0 | 7.0 | 4.0 | 3.0 | 3.0 | 4.0 | 4.0 |
| 39775 | 48 | Married | Hetero | Christian Other | 6.0 | 7.0 | 5.0 | 3.0 | 6.0 | 1.0 | 5.0 | 2.0 | 2.0 | 1.0 | 2.0 |
| 39776 | 20 | Never | Hetero | Protestant | 1.0 | 5.0 | 7.0 | 5.0 | 3.0 | 5.0 | 3.0 | 3.0 | 2.0 | 4.0 | 4.0 |
| 39777 | 26 | Never | Hetero | Muslim | 6.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 | 1.0 | 2.0 | 2.0 | 1.0 | 3.0 |
| 39778 | 26 | Never | Hetero | Muslim | 6.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 | 1.0 | 2.0 | 2.0 | 1.0 | 3.0 |
39779 rows × 15 columns
#count the rows that exactly repeat an earlier row
duplicate_mask = df_psy.duplicated()
duplicates = duplicate_mask.sum()
duplicates
16
#total number of rows in the data
df_psy.shape[0]
39779
#share of duplicated rows, expressed in percent
duplicates / len(df_psy) * 100
0.04022222780864275
Less than 1% of the data (about 0.04%) is duplicated, so it is safe to delete these duplicates.
#show the duplicated rows themselves
#duplicated() must be called so that a boolean mask is used for the selection;
#passing the bare method only worked because pandas calls callable indexers with the frame
duplicates = df_psy[df_psy.duplicated()]
duplicates
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5840 | 20 | Never | Hetero | Muslim | 4.0 | 7.0 | 1.0 | 5.0 | 6.0 | 1.0 | 6.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 5958 | 19 | Never | New Types | Muslim | 5.0 | 6.0 | 7.0 | 7.0 | 7.0 | 7.0 | 4.0 | 4.0 | 3.0 | 2.0 | 4.0 |
| 8778 | 20 | Never | New Types | Muslim | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 2.0 | 2.0 |
| 13331 | 18 | Never | Hetero | Muslim | 1.0 | 1.0 | 7.0 | 7.0 | 7.0 | 7.0 | 1.0 | 4.0 | 4.0 | 4.0 | 4.0 |
| 19915 | 23 | Never | Hetero | Muslim | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.0 | 2.0 |
| 23087 | 18 | Never | Hetero | Muslim | 1.0 | 1.0 | 7.0 | 7.0 | 7.0 | 7.0 | 1.0 | 4.0 | 4.0 | 4.0 | 4.0 |
| 23201 | 18 | Never | Hetero | Muslim | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 |
| 24848 | 22 | Never | New Types | Muslim | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 26074 | 28 | Never | Hetero | Muslim | 6.0 | 6.0 | 5.0 | 4.0 | 6.0 | 5.0 | 6.0 | 2.0 | 1.0 | 1.0 | 2.0 |
| 28382 | 18 | Never | New Types | Muslim | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 4.0 | 4.0 | 4.0 | 4.0 |
| 32458 | 23 | Never | Hetero | Muslim | 5.0 | 6.0 | 6.0 | 5.0 | 5.0 | 5.0 | 5.0 | 3.0 | 2.0 | 2.0 | 3.0 |
| 38016 | 21 | Never | New Types | Muslim | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 3.0 | 2.0 | 2.0 | 2.0 |
| 38951 | 19 | Never | Hetero | Muslim | 2.0 | 2.0 | 7.0 | 6.0 | 5.0 | 7.0 | 2.0 | 4.0 | 4.0 | 4.0 | 4.0 |
| 39148 | 18 | Never | New Types | Muslim | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 |
| 39753 | 15 | Never | Hetero | Jewish | 3.0 | 6.0 | 6.0 | 6.0 | 3.0 | 2.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| 39778 | 26 | Never | Hetero | Muslim | 6.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 | 1.0 | 2.0 | 2.0 | 1.0 | 3.0 |
#drop the repeated rows, keeping the first occurrence of each
df_psy = df_psy.drop_duplicates()
#confirm that no duplicates remain (vectorised Series.sum() instead of Python's built-in sum)
duplicates = df_psy.duplicated().sum()
duplicates
0
#display the data after the duplicates were removed
df_psy
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16 | Never | Hetero | Other | 1.0 | 7.0 | 7.0 | 7.0 | 7.0 | 5.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 1 | 16 | Never | New Types | Christian Other | 6.0 | 4.0 | 7.0 | 4.0 | 7.0 | 7.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 2 | 17 | Never | Homo | Christian | 2.0 | 2.0 | 2.0 | 6.0 | 5.0 | 5.0 | 3.0 | 3.0 | 4.0 | 3.0 | 4.0 |
| 3 | 13 | Never | New Types | Christian | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 1.0 | 6.0 | 2.0 | 2.0 | 2.0 | 1.0 |
| 4 | 19 | Never | Hetero | Muslim | 2.0 | 3.0 | 6.0 | 5.0 | 5.0 | 6.0 | 3.0 | 2.0 | 3.0 | 4.0 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39773 | 16 | Never | Asexual | Atheist | 2.0 | 3.0 | 5.0 | 5.0 | 5.0 | 3.0 | 3.0 | 2.0 | 3.0 | 3.0 | 4.0 |
| 39774 | 21 | Never | New Types | Muslim | 4.0 | 5.0 | 7.0 | 6.0 | 4.0 | 7.0 | 4.0 | 3.0 | 3.0 | 4.0 | 4.0 |
| 39775 | 48 | Married | Hetero | Christian Other | 6.0 | 7.0 | 5.0 | 3.0 | 6.0 | 1.0 | 5.0 | 2.0 | 2.0 | 1.0 | 2.0 |
| 39776 | 20 | Never | Hetero | Protestant | 1.0 | 5.0 | 7.0 | 5.0 | 3.0 | 5.0 | 3.0 | 3.0 | 2.0 | 4.0 | 4.0 |
| 39777 | 26 | Never | Hetero | Muslim | 6.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 | 1.0 | 2.0 | 2.0 | 1.0 | 3.0 |
39763 rows × 15 columns
There is a reduction in number of rows due to deletion of duplicated values
#summary of the data: column names, non-null counts and dtypes
df_psy.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 39763 entries, 0 to 39777 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 39763 non-null int64 1 married 39763 non-null object 2 orientation 39762 non-null object 3 religion 39761 non-null object 4 Q1 39761 non-null float64 5 Q2 39758 non-null float64 6 Q3 39759 non-null float64 7 Q4 39761 non-null float64 8 Q5 39759 non-null float64 9 Q6 39759 non-null float64 10 Q7 39759 non-null float64 11 Q8 39758 non-null float64 12 Q9 39759 non-null float64 13 Q10 39760 non-null float64 14 Q11 39761 non-null float64 dtypes: float64(11), int64(1), object(3) memory usage: 4.9+ MB
# checking for null values: True marks a missing entry, False a present one
df_psy.isna()
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39773 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 39774 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 39775 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 39776 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 39777 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
39763 rows × 15 columns
#show only the rows that have at least one missing value
df_psy[df_psy.isna().any(axis=1)]
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 90 | 15 | Never | New Types | NaN | 1.0 | NaN | NaN | 6.0 | NaN | 6.0 | NaN | NaN | NaN | 4.0 | 4.0 |
| 137 | 20 | Never | Hetero | Atheist | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 153 | 24 | Never | Homo | Hindu | 5.0 | NaN | 4.0 | 5.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 225 | 19 | Never | NaN | NaN | 2.0 | NaN | NaN | 6.0 | NaN | NaN | 3.0 | NaN | 2.0 | NaN | 3.0 |
| 257 | 15 | Never | Homo | Jewish | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.0 | 2.0 |
1. df_psy.isna() returns a DataFrame of the same shape as df_psy, where each element is either True if the corresponding element in df_psy is NaN or null, or False otherwise. This operation checks for missing values in each element of the DataFrame.
2. .any(axis=1) is used to check if there are any True values across each row. By specifying axis=1, it checks for the presence of at least one True value in each row. The result is a boolean Series with the same number of rows as df_psy, where each element indicates whether the corresponding row has any missing values.
3. df_psy[df_psy.isna().any(axis=1)] is indexing df_psy using the boolean Series from the previous step. It selects only those rows from df_psy where the corresponding value in the boolean Series is True, meaning that row has at least one missing value.
The result is assigned to `null`, a new DataFrame containing only the rows of df_psy that have at least one missing value.
#keep the rows that contain at least one missing value for inspection
null = df_psy[df_psy.isna().any(axis=1)]
#remove every row that has at least one missing value
df_psy = df_psy.dropna(axis=0, how='any')
df_psy
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16 | Never | Hetero | Other | 1.0 | 7.0 | 7.0 | 7.0 | 7.0 | 5.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 1 | 16 | Never | New Types | Christian Other | 6.0 | 4.0 | 7.0 | 4.0 | 7.0 | 7.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 2 | 17 | Never | Homo | Christian | 2.0 | 2.0 | 2.0 | 6.0 | 5.0 | 5.0 | 3.0 | 3.0 | 4.0 | 3.0 | 4.0 |
| 3 | 13 | Never | New Types | Christian | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 1.0 | 6.0 | 2.0 | 2.0 | 2.0 | 1.0 |
| 4 | 19 | Never | Hetero | Muslim | 2.0 | 3.0 | 6.0 | 5.0 | 5.0 | 6.0 | 3.0 | 2.0 | 3.0 | 4.0 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39773 | 16 | Never | Asexual | Atheist | 2.0 | 3.0 | 5.0 | 5.0 | 5.0 | 3.0 | 3.0 | 2.0 | 3.0 | 3.0 | 4.0 |
| 39774 | 21 | Never | New Types | Muslim | 4.0 | 5.0 | 7.0 | 6.0 | 4.0 | 7.0 | 4.0 | 3.0 | 3.0 | 4.0 | 4.0 |
| 39775 | 48 | Married | Hetero | Christian Other | 6.0 | 7.0 | 5.0 | 3.0 | 6.0 | 1.0 | 5.0 | 2.0 | 2.0 | 1.0 | 2.0 |
| 39776 | 20 | Never | Hetero | Protestant | 1.0 | 5.0 | 7.0 | 5.0 | 3.0 | 5.0 | 3.0 | 3.0 | 2.0 | 4.0 | 4.0 |
| 39777 | 26 | Never | Hetero | Muslim | 6.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 | 1.0 | 2.0 | 2.0 | 1.0 | 3.0 |
39758 rows × 15 columns
There is a reduction in number of rows due to deletion of null values from the Dataframe
OUTLIERS
Outliers are extreme values within the dataset that deviate significantly from the rest of the (so-called normal) observations, being either much larger or much smaller.
Outliers can be detected using
#matplotlib box plot of the Q8 answers to look for outliers
var = df_psy.loc[:, 'Q8']
plt.boxplot(x=var)
plt.show()
# the same Q8 outlier check as an interactive plotly express box plot
px.box(var)
There are no outliers in Q8.
#plotly express box plot of Q8 with all data points displayed
px.box(var, points = 'all')
Based on the box plot using Plotly:
#display the data before the outlier checks on the remaining columns
df_psy
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16 | Never | Hetero | Other | 1.0 | 7.0 | 7.0 | 7.0 | 7.0 | 5.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 1 | 16 | Never | New Types | Christian Other | 6.0 | 4.0 | 7.0 | 4.0 | 7.0 | 7.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 2 | 17 | Never | Homo | Christian | 2.0 | 2.0 | 2.0 | 6.0 | 5.0 | 5.0 | 3.0 | 3.0 | 4.0 | 3.0 | 4.0 |
| 3 | 13 | Never | New Types | Christian | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 1.0 | 6.0 | 2.0 | 2.0 | 2.0 | 1.0 |
| 4 | 19 | Never | Hetero | Muslim | 2.0 | 3.0 | 6.0 | 5.0 | 5.0 | 6.0 | 3.0 | 2.0 | 3.0 | 4.0 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39773 | 16 | Never | Asexual | Atheist | 2.0 | 3.0 | 5.0 | 5.0 | 5.0 | 3.0 | 3.0 | 2.0 | 3.0 | 3.0 | 4.0 |
| 39774 | 21 | Never | New Types | Muslim | 4.0 | 5.0 | 7.0 | 6.0 | 4.0 | 7.0 | 4.0 | 3.0 | 3.0 | 4.0 | 4.0 |
| 39775 | 48 | Married | Hetero | Christian Other | 6.0 | 7.0 | 5.0 | 3.0 | 6.0 | 1.0 | 5.0 | 2.0 | 2.0 | 1.0 | 2.0 |
| 39776 | 20 | Never | Hetero | Protestant | 1.0 | 5.0 | 7.0 | 5.0 | 3.0 | 5.0 | 3.0 | 3.0 | 2.0 | 4.0 | 4.0 |
| 39777 | 26 | Never | Hetero | Muslim | 6.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 | 1.0 | 2.0 | 2.0 | 1.0 | 3.0 |
39758 rows × 15 columns
#box plots of every numeric column, one facet per column, to spot outliers
num = df_psy.drop(columns=['married', 'orientation', 'religion'])
long_num = pd.melt(num)
fig = px.box(long_num, y='value', color='variable', facet_col='variable', boxmode='overlay')
#unlink the y-axes of the facets from each other
fig.update_yaxes(matches=None)
Outliers are in age, Q4, Q5 & Q11
#largest age reported in the data
max_age = df_psy.loc[:, 'age'].max()
max_age
1998
An age of 1998 is not plausible (it was probably a birth year entered in place of an age).
#rows whose reported age is above 100 (the age outliers)
max_age = df_psy.loc[df_psy['age'] > 100]
max_age
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5342 | 223 | Never | Hetero | Muslim | 7.0 | 6.0 | 3.0 | 2.0 | 1.0 | 2.0 | 6.0 | 2.0 | 1.0 | 1.0 | 1.0 |
| 10233 | 1996 | Never | New Types | Muslim | 5.0 | 7.0 | 7.0 | 7.0 | 5.0 | 3.0 | 5.0 | 4.0 | 4.0 | 3.0 | 4.0 |
| 14238 | 117 | Never | Homo | Christian Other | 3.0 | 1.0 | 7.0 | 6.0 | 3.0 | 6.0 | 2.0 | 4.0 | 3.0 | 4.0 | 3.0 |
| 21385 | 1998 | Never | Asexual | Muslim | 1.0 | 1.0 | 6.0 | 7.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1.0 | 2.0 | 2.0 |
| 24900 | 115 | Never | Homo | Christian | 5.0 | 3.0 | 6.0 | 6.0 | 5.0 | 6.0 | 2.0 | 3.0 | 2.0 | 1.0 | 2.0 |
| 30029 | 1993 | Married | Hetero | Muslim | 4.0 | 7.0 | 7.0 | 7.0 | 7.0 | 6.0 | 2.0 | 4.0 | 4.0 | 3.0 | 4.0 |
| 33732 | 1991 | Never | Hetero | Atheist | 2.0 | 7.0 | 7.0 | 5.0 | 6.0 | 6.0 | 2.0 | 3.0 | 3.0 | 3.0 | 4.0 |
#delete the age outliers: keep ages up to and including 100, the exact complement of the
#age > 100 check, so that a respondent aged exactly 100 is not removed by accident
df_psy = df_psy[df_psy['age'] <= 100]
#finding outliers in Q4: >= 8 is the exact complement of the Q4 < 8 filter applied afterwards,
#so every row that filter removes is shown here (a value of exactly 8 was missed by > 8)
Q = df_psy[df_psy['Q4'] >= 8]
Q
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 34616 | 21 | Never | Hetero | Atheist | 3.0 | 7.0 | 3.0 | 33.0 | 7.0 | 1.0 | 5.0 | 3.0 | 4.0 | 4.0 | 4.0 |
The outlier in Q4 is 33: according to the Plotly Express box plot, the minimum value is 1 and the maximum value is 7.
#keep only the rows whose Q4, Q5 and Q11 answers are below the outlier thresholds
#NOTE: the third filter used to be Q4 < 5, which threw away every valid Q4 answer of 5, 6 or 7;
#Q11 is the remaining column with outliers, so the filter is applied to Q11 instead.
#Outputs recorded further down were produced with the old filter - re-run the following cells.
valid_rows = (
    (df_psy['Q4'] < 8)
    & (df_psy['Q5'] < 8)
    & (df_psy['Q11'] < 5)
)
#copy() makes the result an independent DataFrame so new columns can be added to it safely
df_psy = df_psy[valid_rows].copy()
#box plots of the numeric columns again, now that the outlier rows are gone
num = df_psy.drop(columns=['married', 'orientation', 'religion'])
long_num = pd.melt(num)
fig = px.box(long_num, y='value', color='variable', facet_col='variable', boxmode='overlay')
#unlink the y-axes of the facets from each other
fig.update_yaxes(matches=None)
NON-NUMERIC VARIABLES
#display the data after the outlier rows were removed
df_psy
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 16 | Never | New Types | Christian Other | 6.0 | 4.0 | 7.0 | 4.0 | 7.0 | 7.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 |
| 3 | 13 | Never | New Types | Christian | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 1.0 | 6.0 | 2.0 | 2.0 | 2.0 | 1.0 |
| 6 | 17 | Never | Bi | Christian Other | 2.0 | 6.0 | 5.0 | 2.0 | 1.0 | 3.0 | 5.0 | 1.0 | 2.0 | 3.0 | 2.0 |
| 7 | 29 | Never | Bi | Atheist | 7.0 | 4.0 | 5.0 | 2.0 | 2.0 | 3.0 | 5.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 10 | 15 | Never | Hetero | Protestant | 5.0 | 6.0 | 6.0 | 4.0 | 4.0 | 7.0 | 5.0 | 3.0 | 2.0 | 3.0 | 2.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39765 | 22 | Never | Hetero | Muslim | 1.0 | 7.0 | 2.0 | 4.0 | 6.0 | 1.0 | 4.0 | 1.0 | 1.0 | 2.0 | 3.0 |
| 39771 | 27 | Never | Hetero | Christian | 6.0 | 6.0 | 7.0 | 4.0 | 6.0 | 5.0 | 5.0 | 2.0 | 1.0 | 2.0 | 2.0 |
| 39772 | 16 | Never | Hetero | Agnostic | 6.0 | 4.0 | 5.0 | 3.0 | 5.0 | 2.0 | 5.0 | 3.0 | 2.0 | 2.0 | 4.0 |
| 39775 | 48 | Married | Hetero | Christian Other | 6.0 | 7.0 | 5.0 | 3.0 | 6.0 | 1.0 | 5.0 | 2.0 | 2.0 | 1.0 | 2.0 |
| 39777 | 26 | Never | Hetero | Muslim | 6.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 | 1.0 | 2.0 | 2.0 | 1.0 | 3.0 |
14313 rows × 15 columns
#list the distinct values of each non-numeric column to make sure they are all proper categories
df_psy['married'].unique()
array(['Never', 'Married', 'Previously', 'Second'], dtype=object)
#distinct values of the orientation column
df_psy['orientation'].unique()
array(['New Types', 'Bi', 'Hetero', 'Homo', 'Asexual'], dtype=object)
#distinct values of the religion column
df_psy['religion'].unique()
array(['Christian Other', 'Christian', 'Atheist', 'Protestant',
'Agnostic', 'Other', 'Jewish', 'Hindu', 'Buddhist', 'Muslim',
'Mormon', 'Sikh'], dtype=object)
#changing marital status to a numerical variable using a dictionary
matrimonial = {
    'Never': 1,
    'Previously': 2,
    'Married': 3,
    'Second': 4,
}
#build the new column with assign(): it returns a new DataFrame instead of writing onto a
#filtered selection of the data, which avoids pandas' SettingWithCopyWarning.
#Values that are not keys of the dictionary become NaN.
df_psy = df_psy.assign(MarriedN=df_psy['married'].map(matrimonial))
#numeric codes for the orientation categories (labels only, the order carries no meaning)
sexuality = {
    'New Types': 1,
    'Bi': 2,
    'Hetero': 3,
    'Homo': 4,
    'Asexual': 5,
}
#build the new column with assign(): it returns a new DataFrame instead of writing onto a
#filtered selection of the data, which avoids pandas' SettingWithCopyWarning.
#Values that are not keys of the dictionary become NaN.
df_psy = df_psy.assign(OrientationN=df_psy['orientation'].map(sexuality))
#numeric codes for the religion categories (labels only, the order carries no meaning)
religia = {
    'Christian Other': 1,
    'Christian': 2,
    'Atheist': 3,
    'Protestant': 4,
    'Agnostic': 5,
    'Other': 6,
    'Jewish': 7,
    'Hindu': 8,
    'Buddhist': 9,
    'Muslim': 10,
    'Mormon': 11,
    'Sikh': 12,
}
#build the new column with assign(): it returns a new DataFrame instead of writing onto a
#filtered selection of the data, which avoids pandas' SettingWithCopyWarning.
#Values that are not keys of the dictionary become NaN.
df_psy = df_psy.assign(ReligiaN=df_psy['religion'].map(religia))
df_psy
| age | married | orientation | religion | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | MarriedN | OrientationN | ReligiaN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 16 | Never | New Types | Christian Other | 6.0 | 4.0 | 7.0 | 4.0 | 7.0 | 7.0 | 1.0 | 4.0 | 2.0 | 4.0 | 4.0 | 1 | 1 | 1 |
| 3 | 13 | Never | New Types | Christian | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 1.0 | 6.0 | 2.0 | 2.0 | 2.0 | 1.0 | 1 | 1 | 2 |
| 6 | 17 | Never | Bi | Christian Other | 2.0 | 6.0 | 5.0 | 2.0 | 1.0 | 3.0 | 5.0 | 1.0 | 2.0 | 3.0 | 2.0 | 1 | 2 | 1 |
| 7 | 29 | Never | Bi | Atheist | 7.0 | 4.0 | 5.0 | 2.0 | 2.0 | 3.0 | 5.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1 | 2 | 3 |
| 10 | 15 | Never | Hetero | Protestant | 5.0 | 6.0 | 6.0 | 4.0 | 4.0 | 7.0 | 5.0 | 3.0 | 2.0 | 3.0 | 2.0 | 1 | 3 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39765 | 22 | Never | Hetero | Muslim | 1.0 | 7.0 | 2.0 | 4.0 | 6.0 | 1.0 | 4.0 | 1.0 | 1.0 | 2.0 | 3.0 | 1 | 3 | 10 |
| 39771 | 27 | Never | Hetero | Christian | 6.0 | 6.0 | 7.0 | 4.0 | 6.0 | 5.0 | 5.0 | 2.0 | 1.0 | 2.0 | 2.0 | 1 | 3 | 2 |
| 39772 | 16 | Never | Hetero | Agnostic | 6.0 | 4.0 | 5.0 | 3.0 | 5.0 | 2.0 | 5.0 | 3.0 | 2.0 | 2.0 | 4.0 | 1 | 3 | 5 |
| 39775 | 48 | Married | Hetero | Christian Other | 6.0 | 7.0 | 5.0 | 3.0 | 6.0 | 1.0 | 5.0 | 2.0 | 2.0 | 1.0 | 2.0 | 3 | 3 | 1 |
| 39777 | 26 | Never | Hetero | Muslim | 6.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 | 1.0 | 2.0 | 2.0 | 1.0 | 3.0 | 1 | 3 | 10 |
14313 rows × 18 columns
# number of respondents per religion, most frequent first
df_psy.loc[:, 'religion'].value_counts()
Muslim 8779 Christian 1123 Atheist 1023 Agnostic 941 Christian Other 787 Other 562 Protestant 523 Hindu 267 Buddhist 177 Jewish 59 Mormon 44 Sikh 28 Name: religion, dtype: int64
# pie chart (plotly express) of how many respondents belong to each religion
religion_counts = df_psy.loc[:, 'religion'].value_counts()
fig = px.pie(
    values=religion_counts.values,
    names=religion_counts.index,
).update_layout(title='Religion Distribution')
fig.show()
# number of respondents per sexual orientation, most frequent first
df_psy.loc[:, 'orientation'].value_counts()
Hetero 8926 New Types 2443 Bi 1630 Homo 668 Asexual 646 Name: orientation, dtype: int64
# bar chart (plotly express) of how many respondents report each sexual orientation
sexuality_counts = df_psy['orientation'].value_counts()
fig = px.bar(
    x=sexuality_counts.index,
    y=sexuality_counts,
    title='Sexuality count',
)
fig.update_layout(yaxis_title='People')
fig.show()